"""
K-Means clustering.

Run with:
  bin/spark-submit examples/src/main/python/ml/kmeans_example.py

This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SparkSession
if __name__ == "__main__":
    # Entry point: fit a 2-cluster K-Means model on the sample libsvm data,
    # report its silhouette score and print the learned cluster centers.
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # Everything that uses the session lives inside try/finally so the
    # session is stopped even when loading, fitting or evaluation raises.
    try:
        # Loads data
        dataset = spark.read.format("libsvm").load("/opt/spark/data/mllib/sample_kmeans_data.txt")

        # Trains a K-Means model (k=2; fixed seed for reproducible results)
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(dataset)

        # Make predictions: assign every input row to one of the clusters
        predictions = model.transform(dataset)

        # Evaluate clustering by computing the Silhouette score using the
        # evaluator's default settings
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        print("Silhouette with Squared Euclidean Distance = " + str(silhouette))

        # Shows the result: one line per cluster center
        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)
    finally:
        spark.stop()
# Sample output:
# Silhouette with Squared Euclidean Distance = 0.9997530305375207
# Cluster Centers:
# [0.1 0.1 0.1]
# [9.1 9.1 9.1]